{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from buckaroo.buckaroo_widget import BuckarooWidget"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('./yellow_tripdata_2021-02.csv')\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "w = BuckarooWidget(df)\n",
    "w"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Adding a summary stat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "w = BuckarooWidget(df)\n",
    "from buckaroo.pluggable_analysis_framework.pluggable_analysis_framework import ColAnalysis\n",
    "from scipy.stats import skew\n",
    "class Skew(ColAnalysis):\n",
    "    provides_summary = [\"skew\"]\n",
    "    requires_summary = []\n",
    "    \n",
    "    @staticmethod\n",
    "    def summary(sampled_ser, summary_ser, ser):\n",
    "        if pd.api.types.is_integer_dtype(sampled_ser):\n",
    "            return dict(skew=skew(sampled_ser.dropna().astype('int64')))\n",
    "        elif pd.api.types.is_float_dtype(sampled_ser):\n",
    "            return dict(skew=skew(sampled_ser.astype('float64')))\n",
    "        else:\n",
    "            return dict(skew=\"NA\")\n",
    "    #fixme\n",
    "    summary_stats_display = [\n",
    "        'dtype',\n",
    "        'length',\n",
    "        'nan_count',\n",
    "        'distinct_count',\n",
    "        'empty_count',\n",
    "        'empty_per',\n",
    "        'unique_per',\n",
    "        'nan_per',\n",
    "        'is_numeric',\n",
    "        'is_integer',\n",
    "        'is_datetime',\n",
    "        'mode',\n",
    "        'min',\n",
    "        'max',\n",
    "        'mean',\n",
    "        # we must add skew to the list of summary_stats_display, otherwise our new stat won't be displayed\n",
    "        'skew']\n",
    "w.add_analysis(Skew)\n",
    "w"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Basic Unit testing is built in\n",
    "\n",
    "Because there are so many corner cases with numerical code, every time a new summary stat is added, a variety of simple tests are run against it.  This lets you discover bugs earlier."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "w = BuckarooWidget(df[:500], showCommands=False)\n",
    "# Here we try to replace min_digits with a version that throws errors on bools.\n",
    "# Small error, but representantive of the types of erros that we see in the wild.\n",
    "def int_digits(n):\n",
    "    if np.isnan(n):\n",
    "        return 1\n",
    "    if n == 0:\n",
    "        return 1\n",
    "    if np.sign(n) == -1:\n",
    "        return int(np.floor(np.log10(np.abs(n)))) + 2\n",
    "    return int(np.floor(np.log10(n)+1))\n",
    "\n",
    "class MinDigits(ColAnalysis):\n",
    "    requires_summary = [\"min\"]\n",
    "    provides_summary = [\"min_digits\"]\n",
    "    \n",
    "    @staticmethod\n",
    "    def summary(sampled_ser, summary_ser, ser):\n",
    "        is_numeric = pd.api.types.is_numeric_dtype(sampled_ser.dtype)\n",
    "        if is_numeric:\n",
    "            return {\n",
    "                'min_digits':int_digits(summary_ser.loc['min'])}\n",
    "        else:\n",
    "            return {\n",
    "                'min_digits':0}\n",
    "w.add_analysis(MinDigits)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "#now we can use the reproduce line to quickly try this and iterate until we get to a solution\n",
    "from buckaroo.analysis_management import PERVERSE_DF\n",
    "MinDigits.summary(PERVERSE_DF['UInt8None'], pd.Series({'min': pd.NA, }), PERVERSE_DF['UInt8None'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# use quiet = True if you just want the analysis to shutup and work for the most part.\n",
    "# not generally recommended, but expedient.  If this wasn't built in, users would write\n",
    "# this functionality again over and over, poorly\n",
    "\n",
    "class MinDigits(ColAnalysis):\n",
    "    requires_summary = [\"min\"]\n",
    "    provides_summary = [\"min_digits\"]\n",
    "    quiet = True\n",
    "    \n",
    "    @staticmethod\n",
    "    def summary(sampled_ser, summary_ser, ser):\n",
    "        is_numeric = pd.api.types.is_numeric_dtype(sampled_ser.dtype)\n",
    "        if is_numeric:\n",
    "            return {\n",
    "                'min_digits':int_digits(summary_ser.loc['min'])}\n",
    "        else:\n",
    "            return {\n",
    "                'min_digits':0}\n",
    "w.add_analysis(MinDigits)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Making a new default dataframe display function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from buckaroo.widget_utils import disable\n",
    "from IPython.core.getipython import get_ipython\n",
    "from IPython.display import display\n",
    "import warnings\n",
    "\n",
    "disable()\n",
    "def my_display_as_buckaroo(df):\n",
    "    w  = BuckarooWidget(df, showCommands=False)\n",
    "    #the analysis we added throws warnings, let's muffle that when used as the default display\n",
    "    warnings.filterwarnings('ignore')\n",
    "    w.add_analysis(Skew)\n",
    "    warnings.filterwarnings('default')\n",
    "    return display(w)\n",
    "\n",
    "def my_enable():\n",
    "    \"\"\"\n",
    "    Automatically use buckaroo to display all DataFrames\n",
    "    instances in the notebook.\n",
    "\n",
    "    \"\"\"\n",
    "    ip = get_ipython()\n",
    "    if ip is None:\n",
    "        print(\"must be running inside ipython to enable default display via enable()\")\n",
    "        return\n",
    "    ip_formatter = ip.display_formatter.ipython_display_formatter\n",
    "    ip_formatter.for_type(pd.DataFrame, my_display_as_buckaroo)\n",
    "my_enable()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Adding a Command to the Low Code UI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "w = BuckarooWidget(df[:500])\n",
    "w"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from buckaroo.customizations.all_transforms import Command\n",
    "from buckaroo.jlisp.lisp_utils import s\n",
    "w = BuckarooWidget(df[:500])\n",
    "#Here we start adding commands to the Buckaroo Widget.  Every call to add_command replaces a command with the same name\n",
    "@w.add_command\n",
    "class GroupBy2(Command):\n",
    "    command_default = [s(\"groupby2\"), s('df'), 'col', {}]\n",
    "    command_pattern = [[3, 'colMap', 'colEnum', ['null', 'sum', 'mean', 'median', 'count']]]\n",
    "    @staticmethod \n",
    "    def transform(df, col, col_spec):\n",
    "        grps = df.groupby(col)\n",
    "        df_contents = {}\n",
    "        for k, v in col_spec.items():\n",
    "            if v == \"sum\":\n",
    "                df_contents[k] = grps[k].apply(lambda x: x.sum())\n",
    "            elif v == \"mean\":\n",
    "                df_contents[k] = grps[k].apply(lambda x: x.mean())\n",
    "            elif v == \"median\":\n",
    "                df_contents[k] = grps[k].apply(lambda x: x.median())\n",
    "            elif v == \"count\":\n",
    "                df_contents[k] = grps[k].apply(lambda x: x.count())\n",
    "        return pd.DataFrame(df_contents)\n",
    "\n",
    "    @staticmethod \n",
    "    def transform_to_py(df, col, col_spec):\n",
    "        commands = [\n",
    "            \"    grps = df.groupby('%s')\" % col,\n",
    "            \"    df_contents = {}\"\n",
    "        ]\n",
    "        for k, v in col_spec.items():\n",
    "            if v == \"sum\":\n",
    "                commands.append(\"    paddydf_contents['%s'] = grps['%s'].apply(lambda x: x.sum())\" % (k, k))\n",
    "            elif v == \"mean\":\n",
    "                commands.append(\"    df_contents['%s'] = grps['%s'].apply(lambda x: x.mean())\" % (k, k))\n",
    "            elif v == \"median\":\n",
    "                commands.append(\"    df_contents['%s'] = grps['%s'].apply(lambda x: x.median())\" % (k, k))\n",
    "            elif v == \"count\":\n",
    "                commands.append(\"    df_contents['%s'] = grps['%s'].apply(lambda x: x.count())\" % (k, k))\n",
    "        commands.append(\"    df = pd.DataFrame(df_contents)\")\n",
    "        return \"\\n\".join(commands)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note that `groupby2` has been added to the commands"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "w.ac_obj."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.2"
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "state": {},
    "version_major": 2,
    "version_minor": 0
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
